#### packages ####

library(tidyverse)
library(stringr)
library(httr)
library(urltools)
library(quanteda)
library(lubridate)
library(scales)
library(cowplot)
library(quanteda)
library(stm)
library(stargazer)

theme_set(theme_bw())


##########################
#### data preparation ####
##########################

#### read in data set ####

d <-read_tsv('yt_1000_de_processed.tsv')

#### restrict data set to top 100 channels ####

top100 <- d %>% group_by(channel_id) %>%
  summarise(subscriptions = max(channel_subscriptions),
            views = sum(as.numeric(video_views), na.rm = T),
            name = max(channel_name)) %>% 
  arrange(desc(subscriptions)) %>% 
  head(100) %>% 
  pull(name)

df <- d %>% filter(channel_name %in% top100)

#### get url info ####

# this code will call all detected urls inside video descriptions
# as some of the urls might not be available at later times
# the important variables are already contained in the dataset.

url_ex <- function(input_string) {
  # regular expression to extract all links
  urls <- str_extract_all(input_string, 
                          pattern = 'http\\S+\\s*', 
                          simplify = T) %>% 
    str_trim('right')
  return(urls)
}

u_agent <- 'Insert your agent string here'
get_ext_urls <- function(x){
  # function to open (short) link and retrieve the full url
  for (i in seq_along(x)) {
    
    call_ <-  try(GET(x[i], user_agent(u_agent), timeout(4)), silent = TRUE )
    
    if( class(call_) == 'try-error') {
      write(x[i],"yt_errors.txt",
            append = TRUE)
      next
    }
    
    if (is.null(call_)) {
      write(x[i],"yt_errors.txt",
            append = TRUE)
      Sys.sleep(0.5)
    }
    else {
      url_ <- call_$url
      status_code <- call_$status_code
      line <- c(x[i], url_, status_code) %>% 
        str_c(collapse = ',')
      write(line,file="yt_links.txt", append = TRUE)
      Sys.sleep(0.5)
    }
  }
}

vid_urls <- map(df_100$video_description, url_ex)
all_urls <- flatten_chr(vid_urls) %>% unique() %>% str_trim('right')


get_ext_urls(all_urls) 


#### parse url info ####

# this code parses urls and detects corresponding domains

links <- read_csv('yt_links.txt', 
                  col_names = FALSE)
colnames(links) <- c('start_url', 'parsed_url', 'status_code')
errors = read_csv('yt_errors.txt', 
                  col_names = FALSE) %>%  na.omit()
colnames(errors) <- c('start_url')

parsed <- urltools::url_parse(links$parsed_url)
parsed <- bind_cols(parsed, links)
parsed$retrieved <- TRUE

parsed_e <- urltools::url_parse(errors$start_url)
parsed_e <- bind_cols(parsed_e, errors)
parsed_e$retrieved <- FALSE

parsed <- bind_rows(parsed,parsed_e) %>% select(-port, -fragment)

#### find the most frequent urls and merge manual codings ####

top50_domains <- parsed %>% group_by(domain) %>% 
  summarise(obs = n()) %>% 
  arrange(desc(obs)) %>% head(50)
sum(top50_domains$obs) / nrow(parsed) # 83.32% of all links, fair enough 

ref_cat <- read_tsv('top50_domains.tsv')

parsed_ref <- left_join(parsed, ref_cat, by = 'domain')
parsed_ref$ref<- ifelse(is.na(parsed_ref$ref), 0, parsed_ref$ref)

#### use hash lookup to apply coding to all detected links ####

hash_dic <- new.env(hash = TRUE)  # equivalent to new.env()
#hash function to find the links quickly:

list2env(
  setNames(
    as.list(parsed_ref$ref), 
    parsed_ref$start_url
  ),
  envir = hash_dic
)

lookup <- function(x) {
  # function to look up urls in video urls
  result <- try(get(x, envir = hash_dic), silent = TRUE)
  if(class(result) == 'try-error') {
    result <- as.integer(0)
  }
  return(result)
}

counter <- vector(length = length(vid_urls), mode = 'integer')
# empty vector for counts of reflinks

for (i in seq_along(vid_urls)) {
  #iterating through all urls and checking whether it is a reflink
  counter[i] <- map(vid_urls[[i]], lookup) %>% flatten_int() %>% sum(na.rm = T)
}

#### create variables ####
 
#### filter years and create date variables ####

df <- df %>% 
  filter(video_date > ymd('2008-12-31') & 
           video_date < ymd('2017-06-01')) %>% 
  arrange(video_date) %>% 
  mutate(year_month = format(video_date, "%Y-%m"),
         year_month_d = as.Date(parse_date_time(year_month, '%Y-%m')),
         date_num = as.numeric(factor(year_month)))


#######################
#### data analysis ####
#######################


#### descriptives ####

# table 1

summary <- df %>% mutate(duration = video_duration / 60) %>% 
  select(video_views, video_likes, video_dislikes, duration)  %>%
  as.data.frame()

summary2 <- df %>% group_by(channel_name) %>% 
  summarise(n_videos = n(), subscribers = first(channel_subscriptions)) %>% 
  # months_present = max(date_num) - min(date_num)) %>% 
  as.data.frame()

stargazer(summary, 
          type = 'latex', median = TRUE, summary = TRUE,
          digits = 2, omit.summary.stat = 'n')
stargazer(summary2, type = 'latex', median = TRUE, summary = TRUE,
          digits = 2, omit.summary.stat = 'n')

# figure 2

reorder_size <- function(x) {
  factor(x, levels = names(sort(table(x))))
}

df$video_year <- as.factor(df$video_year)

df %>% ggplot(aes(reorder_size(channel_category2))) + 
  geom_bar() + coord_flip()

c1 <- df %>% ggplot(aes(video_year)) +  geom_bar() + coord_flip() + 
  scale_y_continuous(breaks = pretty_breaks(n = 8), 
             label = unit_format(unit = "K", scale = 1e-3), expand = c(0,0)) + 
  labs(x = 'Year', y =  'Number of video uploads') + 
  theme( panel.grid.major = element_line(size = 0.2, linetype = 'solid',
                                         colour = "grey90"))

c2 <- df %>% ggplot(aes(reorder_size(channel_category2))) +  
  geom_bar() + coord_flip() + 
  scale_y_continuous(breaks = pretty_breaks(n = 8),
             label = unit_format(unit = "K", scale = 1e-3), expand = c(0,0)) + 
  labs(x = 'Channel category', y =  'Number of video uploads') + 
  theme( panel.grid.major = element_line(size = 0.2, linetype = 'solid',
                                         colour = "grey90"))


j <-  plot_grid(c1, c2,  ncol = 2,
                labels = c('A', 'B'), hjust = -0.1)
j

ggsave('descriptive_counts.png', width = 9, height = 6, dpi = 300, plot = j)


#### affiliate links ####

# figure 3

p1 <- df %>% 
  ggplot(aes(video_date, video_refcount)) + geom_smooth() +
  scale_x_date(limits = c(ymd('2009-01-01'), ymd('2017-05-01')),
               date_breaks = '6 months',
               minor_breaks = NULL,
               date_labels = '%Y-%m',
               expand = c(0,0)) +
  scale_y_continuous(minor_breaks = NULL, limits = c(0, NA),
                     expand = c(0,0)) +
  theme(axis.text.x=element_text(angle=90, hjust=1),
        panel.grid.major = element_line(size = 0.2, linetype = 'solid',
                                        colour = "grey90")) +
  scale_color_brewer(palette = 'Set1') +
  scale_fill_brewer(palette = 'Set1') + labs(y = '' , x = 'Date')


p2 <- df %>% 
  ggplot(aes(video_date, video_refcount, 
             color = channel_category2,
             fill = channel_category2)) + geom_smooth() +
  scale_x_date(limits = c(ymd('2009-01-01'), ymd('2017-05-01')),
               date_breaks = '6 months',
               minor_breaks = NULL,
               date_labels = '%Y-%m',
               expand = c(0,0)) +
  scale_y_continuous(minor_breaks = NULL, limits = c(0, NA),
                     expand = c(0,0)) +
  theme(axis.text.x=element_text(angle=90, hjust=1),
        panel.grid.major = element_line(size = 0.2, linetype = 'solid',
                                        colour = "grey90"),
        legend.position = c(0.3, 0.7)) +
  scale_color_brewer(palette = 'Set1') + scale_fill_brewer(palette = 'Set1') +
  labs(y = 'Number of referral links per video', x = 'Date',
       color = 'Channel Category', fill = 'Channel Category')


g <- plot_grid(p1, p2,  ncol = 2,
               labels = c('A', 'B'), hjust = -0.1)
g

ggsave('sellout_links.png', width = 9, height = 6, dpi = 300, plot = g)

#### share of reflinks ####

# appendix 2

r1 <- df %>% 
  ggplot(aes(video_date, video_refshare)) + geom_smooth() +
  theme_bw(base_size =10 ) +
  scale_x_date(limits = c(ymd('2009-01-01'), ymd('2017-05-01')),
               date_breaks = '6 months',
               minor_breaks = NULL,
               date_labels = '%Y-%m',
               expand = c(0,0)) +
  scale_y_continuous(minor_breaks = NULL, limits = c(0, NA),
                     expand = c(0,0),
                     breaks = pretty_breaks(n=5)) +
  theme(axis.text.x=element_text(angle=90, hjust=1),
        panel.grid.major = element_line(size = 0.2, linetype = 'solid',
                                        colour = "grey90")) +
  scale_color_brewer(palette = 'Set1') +
  labs(y = 'Share of referral links per video', x = 'Date') 

r2 <- df %>% 
  ggplot(aes(video_date, video_linkcount)) + geom_smooth() +
  theme_bw(base_size =10 ) +
  scale_x_date(limits = c(ymd('2009-01-01'), ymd('2017-05-01')),
               date_breaks = '6 months',
               minor_breaks = NULL,
               date_labels = '%Y-%m',
               expand = c(0,0)) +
  scale_y_continuous(minor_breaks = NULL, limits = c(0, NA),
                     expand = c(0,0),
                     breaks = pretty_breaks(n=5)) +
  theme(axis.text.x=element_text(angle=90, hjust=1),
        panel.grid.major = element_line(size = 0.2, linetype = 'solid',
                                        colour = "grey90")) +
  scale_color_brewer(palette = 'Set1') +  
  labs(y = 'Total number of links per video', x = 'Date') 

h <- plot_grid(r2, r1,  ncol = 2,
               labels = c('A', 'B'), hjust = -0.1)
h
ggsave('links_selloutshare.png',
       width = 9, height = 6, dpi = 300, plot = h)

#################################
#### structural topic models ####
#################################

# read captions
subs <- read_tsv('yt_100_de_autosubs.tsv')
df <- df %>% left_join(subs, by = 'video_id')



# creating document term matrix
df_stm <- df %>% 
  select(video_autosubs, date_num, year_month, year_month_d, video_year, 
         channel_category2, channel_name, video_linkcount, video_refcount,
         video_url, video_views, video_comments, video_likes, video_dislikes)

dfm_stm  <- dfm(df_stm$video_autosubs, stem = FALSE, 
                remove = c(stopwords('german'), stopwords('english')),
                remove_url = TRUE,
                remove_punct = TRUE,
                verbose = TRUE)
# trimming vocab
dfm_stm2 <- dfm_trim(dfm_stm,
             min_docfreq = 500, # terms occuring in at least 500 videos
             max_docfreq = 0.25) # terms occuring in at max 25% of all documents
dim(dfm_stm2)

# prepare for stm model
out <- convert(dfm_stm2, to = 'stm', 
               docvars = df_stm)
dim(out$meta) 
length(out$vocab)

# the R app stminsights was used to inspect and validate the models below
# https://github.com/methodds/stminsights

# running stm models

stm_025 <- stm(out$documents, out$vocab, data = out$meta,
               init.type = 'Spectral',
               prevalence =~ channel_category2 + date_num,
               K = 25, verbose = FALSE)
effect_stm_025 <- estimateEffect(1:25 ~ channel_category2 + date_num,
                                 stm_025, metadata = out$meta)


stm_050<- stm(out$documents, out$vocab, data = out$meta,
              init.type = 'Spectral',
              prevalence =~ channel_category2 + date_num,
              K = 50, verbose = FALSE)
effect_stm_050 <- estimateEffect(1:50 ~ channel_category2 + date_num,
                                 stm_050, metadata = out$meta)


stm_075 <- stm(out$documents, out$vocab, data = out$meta,
               init.type = 'Spectral',
               prevalence =~ channel_category2 + date_num,
               K = 75, verbose = FALSE)
effect_stm_075 <- estimateEffect(1:75 ~ channel_category2 + date_num,
                                 stm_075, metadata = out$meta)


stm_100 <- stm(out$documents, out$vocab, data = out$meta,
               init.type = 'Spectral',
               prevalence =~ channel_category2 + date_num,
               K = 100)
effect_stm_100 <- estimateEffect(1:100 ~ channel_category2 + date_num,
                                 stm_100, metadata = out$meta)

########################
# model diagnostics ####
########################

# these functions are only included for replication purposes
# I recommend using stminsights for diagnostics instead
# https://github.com/methodds/stminsights

models <-   Filter(function(x) 'STM' %in% class( get(x) ), ls() )

stm_quality <- function(models) {
  len_models <- length(models)
  
  df <- tibble(exclusivity = numeric(), 
               coherence = numeric(),
               statistic = character(), 
               topics = numeric())
  
  for (model in seq_along(1:len_models)) {
    
    
    stm_obj <- get(models[model])
    
    exclusivity_med <- median(exclusivity(stm_obj)) 
    coherence_med <- median(semanticCoherence(stm_obj, 
                                              out$documents))
    exclusivity_mean <- mean(exclusivity(stm_obj)) 
    coherence_mean <- mean(semanticCoherence(stm_obj, 
                                             out$documents))
    nr_topics_m <- ncol(stm_obj$theta)
    
    model_df_mean <- tibble(
      exclusivity = exclusivity_mean, 
      coherence = coherence_mean,
      statistic = 'mean', 
      topics = nr_topics_m)
    
    model_df_median <- tibble(
      exclusivity = exclusivity_med, 
      coherence = coherence_med,
      statistic = 'median', 
      topics = nr_topics_m)
    
    df <- bind_rows(df, model_df_mean, model_df_median)
    
  }
  df$topics <- as.factor(df$topics)
  return(df)
}

stm_diag <- stm_quality(models)

# plotting model diagnostics - appendix 3
stm_diag %>% 
  ggplot(aes(y = exclusivity, x = coherence, 
             color = statistic, shape = topics)) +
  geom_point(size = 1.5) + 
  geom_text(aes(x = coherence + 1.5, label = topics, size = 1.5),
                                     show.legend = FALSE) +
  scale_color_brewer(palette = 'Set1') +theme_light() 

ggsave('yt100_stm_quality.png',   width = 8, height = 6, dpi = 300)

#######################
# stm effect plots ####
#######################

# these functions are only included for replication purposes
# I recommend using stminsights for effect plots instead
# https://github.com/methodds/stminsights


# create helper to convert numerics back to date 
datehelper <- out$meta %>% group_by(date_num) %>% 
  summarise(date = first(year_month_d))
date_dic <- new.env(hash = TRUE)  
list2env(setNames( as.list(datehelper$date),
                   datehelper$date_num) ,
         envir = date_dic)

change_date <- function(x)  {
  date_dic[[as.character(x)]] %>% as.character() %>% 
    str_sub(1, 4)
}


# function for continuous effect plot
plot_continuous <- function(estobj, variable, topic,
                            xlab = variable, ylab = 'Topic Proportion',
                            ci = 0.95) {
  data <- plot(x = estobj, covariate = variable, topic = topic, 
               method = 'continuous',
               ci.level = ci, omit.plot = TRUE)
  means <- tibble(mean  = data$means[[1]], value = data$x)
  cis <- t(data$ci[[1]])
  colnames(cis) <- c('lower', 'upper')
  comb <- cbind(means, cis)
  comb$lower <- ifelse(comb$lower < 0, 0, comb$lower)
  comb$mean <- ifelse(comb$mean < 0, 0, comb$mean)
  
  ggplot(comb,aes(x = value, y = mean)) + 
    geom_line(color ='#396afc', size = 1.1) +
    geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = 'grey60') +
    scale_x_continuous(breaks = c(20, 32, 44, 56, 68, 80, 92, 104, 116),
               labels =map(c(20, 32, 44, 56, 68, 80, 92, 104, 116), change_date),
               expand = c(0, 0)) +
    scale_y_continuous(breaks = pretty_breaks(n=8), expand = c(0.00, 0),
                       labels = percent) +
    guides(fill=F, color = F, group = F)  +  
    labs(x = xlab , y  = ylab) + 
    theme( panel.grid.minor = element_blank(),
           panel.grid.major = element_line(size = 0.2, linetype = 'solid',
                                           colour = "grey90"))
  
}

# function for pointestimate effect plot
plot_pointestimate <- function(estobj, variable, topic,
                               xlab = variable, ylab = 'Topic proportion',
                               ci = 0.95) {
  data <- plot(x = estobj, covariate = variable, topic = topic, 
               method = 'pointestimate', ci.level =ci, omit.plot = TRUE)
  means <- tibble(mean  = data$means[[1]], value = data$uvals,
                  labels = data$labels)
  cis <- t(data$cis[[1]])
  colnames(cis) <- c('lower', 'upper')
  comb <- cbind(means, cis)
  comb$value <- as.factor(comb$value)
  
  ggplot(comb, aes(y = reorder(value, mean), x = mean)) + 
    geom_point(size = 1.0) + 
    geom_line() +  guides(fill=F, color = F, group = F)  +  
    geom_errorbarh(aes(xmin = lower, xmax = upper),
                   height = 0.1) +
    scale_x_continuous(labels = percent,
                       breaks = pretty_breaks(n=8)) + 
    labs(x = ylab , y  = xlab) +
    theme( panel.grid.minor = element_blank(),
           panel.grid.major.x = element_blank(),
           panel.grid.major = element_line(size = 0.2, linetype = 'solid',
                                           colour = "grey90")) +
    coord_flip()
  
}

# figure 4

# plot the effects for the 'sellout' topic 1
s1 <- plot_continuous(effect_stm_050, 'date_num', 14, xlab = 'Date') + 
  labs(y = '', title = '') + 
  theme(axis.text.x = element_text(angle = 90, hjust = -0.5)) 
s2 <- plot_pointestimate(effect_stm_050, 'channel_category2', 14) + 
  theme(axis.text.x = element_text(angle = 20, hjust = 0.9)) + 
  labs(y = 'Channel category',   x = 'Topic proportion:\nhauls & fashion' )

# plot the effects for the 'sellout' topic 2
s3 <- plot_continuous(effect_stm_050, 'date_num', 38, xlab = NULL) + 
  labs(y = '', title = '') + 
  theme(axis.text.x = element_text(angle = 90, hjust = -0.5)) 
s4 <- plot_pointestimate(effect_stm_050, 'channel_category2', 38) + 
  theme(axis.text.x = element_text(angle = 20, hjust = 0.9)) + 
  labs(y = NULL,   x = 'Topic proportion:\nbeauty & cosmetics' )


# combine and save
v <- plot_grid(s1, s2, s3, s4, ncol = 2,
               labels = c('A', 'B', 'C', 'D'), hjust = -0.1)  

ggsave('stm_sellouteffects_all.png',
       width = 9, height = 6, dpi = 300, plot = v)

#### stm tables ####

# these are the topic labels we assigned
topic_labels <- c(
  'music - rap',
  'cheering',
  'social media promotion',
  'gaming in general ',
  'movement',
  'gaming - minecraft 1',
  'game shows',
  'music - misc ',
  'stop words & english words',
  'traveling',
  'enthusiasm',
  'gaming - sim city',
  'music & parties',
  'products - hauls & fashion',
  'gaming - action games',
  'greetings & acknowledgements',
  'gaming - minecraft 2',
  'food & gaming - pokémon',
  'tech devices',
  'family & ceremonies',
  'abbreviations & stopwords',
  'gaming - minecraft 3',
  'movies & trailers',
  'fantasy',
  'subjects',
  'gaming - ships and space',
  'stopwords',
  'dates & seasons',
  'nature & gaming - minecraft 4',
  'filler words & transcript errors',
  'gaming - minecraft 5',
  'school & relationships',
  'food tasting',
  'digits',
  'music - songs & bands',
  'gaming - binding of isaac',
  'tutorials',
  'products - beauty & cosmetics',
  'stories, people & countries',
  'questions & answers',
  'conversations',
  'body & sports',
  'baking & cooking',
  'numbers & placements',
  'shooting & weapons',
  'gaming - minecraft 6',
  'cars & racing',
  'names & characters',
  'gaming - minecraft 7',
  'comedy reports')


# function to get frex terms for each topic
get_topic_df <- function(x, labels = '', what = 'frex', obs = 15 ) {
  helpfun <- function(x) {paste0(x, collapse = ', ')}
  terms <- t(labelTopics(x, n= obs)[[what]]) %>% as_data_frame() %>% 
    summarise_all(helpfun) %>% transpose() %>% unlist()
  props <- colMeans(stm_050$theta)
  ids <- 1:length(props)
  df <- data_frame(id = ids, topic = labels, 
                   proportion = round(props, 3),  terms = terms)
  return(df %>% arrange(desc(proportion)))
  
}

# frex terms -  table 3
topic_df <- get_topic_df(stm_050, labels = topic_labels)
topic_df$topic <- str_replace_all(topic_df$topic, '&', '/')

topic_df %>% dplyr::select(-id, -terms) %>% 
  stargazer(summary = FALSE, rownames = FALSE)
topic_df %>% dplyr::select(topic, terms) %>% 
  stargazer(summary = FALSE, rownames = FALSE)

# function for topic proportions
topic_props <- function(proportions) {
  # stm topic proportions
  names_ <- colnames(proportions)
  frequency <-colMeans(proportions)
  order <- order(frequency, decreasing = F)
  #order <- clusters$order # from the clustering algorithm
  percentage <- frequency[order]
  
  names_ <- names_[order]
  topic <- factor(names_, levels=names_)
  combined <- data.frame(percentage = round(percentage, 3), topic) %>% 
    arrange(desc(percentage))

  
  return(as.tibble(combined))
}

props <- stm_050$theta # topic proportions
colnames(props) <- paste0('T_', as.character(1:ncol(props))) 

props_plot <- topic_props(props)

# proportions - table 2
stargazer(props_plot %>% arrange(desc(percentage)) %>% 
            mutate(percentage = round(percentage, 3),
                   topic = as.character(topic)) %>% 
            select(topic, percentage), summary = FALSE,
          row.names = FALSE)


